All Life Bank is a US bank that has a growing customer base. The majority of these customers are liability customers (depositors) with varying sizes of deposits. The number of customers who are also borrowers (asset customers) is quite small, and the bank is interested in expanding this base rapidly to bring in more loan business and in the process, earn more through the interest on loans. In particular, the management wants to explore ways of converting its liability customers to personal loan customers (while retaining them as depositors).
A campaign that the bank ran last year for liability customers showed a healthy conversion rate of over 9% success. This has encouraged the retail marketing department to devise campaigns with better target marketing to increase the success ratio.
As a Data Scientist at All Life Bank, you have to build a model that will help the marketing department identify the potential customers who have a higher probability of purchasing the loan.
The classification goal is to predict the likelihood of a liability customer buying a personal loan. This means we have to build a model that predicts which customers are most likely to accept the personal loan offer, based on their specific relationship with the bank across the various features given in the dataset.
The data contains the important demographic and banking details of the customers.
Attribute Information:
# upgrade dependencies
# !pip install --upgrade pyflakes
# install zipcode library
# !pip install uszipcode
# commented after first install
# this will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black
# Library to suppress warnings or deprecation notes
import warnings
warnings.filterwarnings("ignore")
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Library to split data
from sklearn.model_selection import train_test_split
# libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# library to process the zipcode column
import uszipcode as usz
from uszipcode import SearchEngine
# Remove the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Set the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
from sklearn import metrics
# Libraries to build Linear Regression Model
from sklearn.linear_model import LogisticRegression
# Libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# To tune different models
from sklearn.model_selection import GridSearchCV
# To get different metric scores
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
plot_confusion_matrix,
make_scorer,
roc_auc_score,
precision_recall_curve,
roc_curve,
)
The nb_black extension is already loaded. To reload it, use: %reload_ext nb_black
# function to extract county from the zipcode using searchengine from uszipcode library
def zip_county(zipcode):
    """Return the county name for a US zipcode, looked up via uszipcode's SearchEngine.

    zipcode: a US zipcode (int or str accepted by SearchEngine.by_zipcode)
    Returns the county string, or None if the zipcode is not in the database.
    """
    search = SearchEngine(simple_zipcode=True)  # simple DB is enough for county/state
    result = search.by_zipcode(zipcode)  # renamed from `zip` — avoid shadowing the builtin
    return result.county
# function to extract state from the zipcode using searchengine from uszipcode library
def zip_state(zipcode):
    """Return the two-letter state code for a US zipcode via uszipcode's SearchEngine.

    zipcode: a US zipcode (int or str accepted by SearchEngine.by_zipcode)
    Returns the state string, or None if the zipcode is not in the database.
    """
    search = SearchEngine(simple_zipcode=True)  # simple DB is enough for county/state
    result = search.by_zipcode(zipcode)  # renamed from `zip` — avoid shadowing the builtin
    return result.state
# histogram and boxplot for the feature
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined
    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to the show density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid= 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="orange"
    )  # boxplot will be created and a star will indicate the mean value of the column
    # Pass bins/palette only when bins was given; a ternary used as a statement for
    # side effects (as before) is error-prone, so build the kwargs explicitly instead.
    hist_kwargs = {"bins": bins, "palette": "Winter"} if bins else {}
    sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, **hist_kwargs)
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="blue", linestyle="-"
    )  # Add median to the histogram
    plt.show()  # show the plot
# labeled_barplot
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top
    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # length of the column; percentage denominator
    count = data[feature].nunique()
    # figure width scales with the number of bars actually displayed
    plt.figure(figsize=((count if n is None else n) + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # idiomatic truth test (was `perc == True`)
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the count/percentage above the bar
    plt.show()  # show the plot
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Plot a stacked bar chart of target-class proportions within each predictor level
    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # least frequent target class is used as the sort key for the bars
    sorter = data[target].value_counts().index[-1]
    # row-normalized crosstab: share of each target class within each predictor level
    # (the un-normalized margins crosstab formerly built here was never used)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 6))
    # single legend placed outside the axes; a duplicate, immediately-overwritten
    # plt.legend call was removed as dead code
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn_with_threshold(
    model, predictors, target, threshold=0.5
):
    """
    Function to compute different metrics, based on the threshold specified, to check classification model performance
    model: classifier
    predictors: independent variables
    target: dependent variable
    threshold: threshold for classifying the observation as class 1
    Returns a one-row dataframe with Accuracy, Recall, Precision and F1.
    """
    # predicted probability of class 1
    pred_prob = model.predict_proba(predictors)[:, 1]
    # boolean predictions; the previous np.round on an already-boolean array was redundant
    pred = pred_prob > threshold
    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score
    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {
            "Accuracy": acc,
            "Recall": recall,
            "Precision": precision,
            "F1": f1,
        },
        index=[0],
    )
    return df_perf
# defining a function to plot the confusion_matrix of a classification model built using sklearn
def confusion_matrix_sklearn_with_threshold(model, predictors, target, threshold=0.5):
    """
    To plot the confusion_matrix, based on the threshold specified, with percentages
    model: classifier
    predictors: independent variables
    target: dependent variable
    threshold: threshold for classifying the observation as class 1
    """
    # predicted probability of class 1
    pred_prob = model.predict_proba(predictors)[:, 1]
    # boolean predictions; the previous np.round on an already-boolean array was redundant
    y_pred = pred_prob > threshold
    cm = confusion_matrix(target, y_pred)
    total = cm.sum()  # hoisted: was recomputed via cm.flatten().sum() for every cell
    # one "count\npercentage" label per confusion-matrix cell
    labels = np.asarray(
        ["{0:0.0f}\n{1:.2%}".format(item, item / total) for item in cm.flatten()]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()
# functions to treat outliers by flooring and capping
def treat_outliers(df, col):
    """
    Treats outliers in a variable
    df: dataframe
    col: dataframe column
    """
    # interquartile range of the column
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    # Tukey whiskers: values outside [q1 - 1.5*IQR, q3 + 1.5*IQR] are floored/capped
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    df[col] = df[col].clip(lower=lower_whisker, upper=upper_whisker)
    return df
#defining a function to treat all outliers in one go
def treat_outliers_all(df, col_list):
    """
    Treat outliers in a list of variables
    df: dataframe
    col_list: list of dataframe columns
    """
    # apply the flooring/capping treatment column by column
    for column in col_list:
        df = treat_outliers(df, column)
    return df
# defining a function to plot precision and recall curves against the threshold
def plot_prec_recall_vs_tresh(precisions, recalls, thresholds):
    """
    Plot precision and recall as functions of the decision threshold
    precisions, recalls: arrays as returned by precision_recall_curve
    thresholds: corresponding threshold values (one shorter than the score arrays)
    """
    # precision/recall arrays have one extra trailing element, so drop it
    curves = (
        (precisions[:-1], "b--", "precision"),
        (recalls[:-1], "g--", "recall"),
    )
    for series, fmt, label in curves:
        plt.plot(thresholds, series, fmt, label=label)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    plt.show()
# defining a function for confusion matrix without threshold
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages
    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    # hard class predictions at the model's default 0.5 threshold
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    total = cm.sum()  # hoisted: was recomputed via cm.flatten().sum() for every cell
    # one "count\npercentage" label per confusion-matrix cell
    labels = np.asarray(
        ["{0:0.0f}\n{1:.2%}".format(item, item / total) for item in cm.flatten()]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()
#Function to calculate recall score
def get_recall_score(model, predictors, target):
"""
model: classifier
predictors: independent variables
target: dependent variable
"""
prediction = model.predict(predictors)
return recall_score(target, prediction)
# load the file
df = pd.read_csv("loan-modelling.csv")
# back up data to preserve the initial version for reference
df_back = df.copy()
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")  # f-string report of dataset shape
There are 5000 rows and 14 columns.
# column list
df.columns  # display the Index of column names
Index(['ID', 'Age', 'Experience', 'Income', 'ZIPCode', 'Family', 'CCAvg',
'Education', 'Mortgage', 'Personal_Loan', 'Securities_Account',
'CD_Account', 'Online', 'CreditCard'],
dtype='object')
# check the dataset information
df.info()  # dtypes, non-null counts and memory usage per column
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 Experience 5000 non-null int64 3 Income 5000 non-null int64 4 ZIPCode 5000 non-null int64 5 Family 5000 non-null int64 6 CCAvg 5000 non-null float64 7 Education 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Personal_Loan 5000 non-null int64 10 Securities_Account 5000 non-null int64 11 CD_Account 5000 non-null int64 12 Online 5000 non-null int64 13 CreditCard 5000 non-null int64 dtypes: float64(1), int64(13) memory usage: 547.0 KB
# check number of null records
df.isnull().sum()  # per-column count of missing values
ID 0 Age 0 Experience 0 Income 0 ZIPCode 0 Family 0 CCAvg 0 Education 0 Mortgage 0 Personal_Loan 0 Securities_Account 0 CD_Account 0 Online 0 CreditCard 0 dtype: int64
Observations:
There are no missing values
df.duplicated().sum()  # number of fully duplicated rows
0
Observations:
There are no duplicate values
# setting the random seed via np.random.seed to get the same random results every time
np.random.seed(1)
# also look at random 10 sample rows
df.sample(n=10)  # with no random_state, sample draws from numpy's global RNG seeded above
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2764 | 2765 | 31 | 5 | 84 | 91320 | 1 | 2.9 | 3 | 105 | 0 | 0 | 0 | 0 | 1 |
| 4767 | 4768 | 35 | 9 | 45 | 90639 | 3 | 0.9 | 1 | 101 | 0 | 1 | 0 | 0 | 0 |
| 3814 | 3815 | 34 | 9 | 35 | 94304 | 3 | 1.3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3499 | 3500 | 49 | 23 | 114 | 94550 | 1 | 0.3 | 1 | 286 | 0 | 0 | 0 | 1 | 0 |
| 2735 | 2736 | 36 | 12 | 70 | 92131 | 3 | 2.6 | 2 | 165 | 0 | 0 | 0 | 1 | 0 |
| 3922 | 3923 | 31 | 4 | 20 | 95616 | 4 | 1.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2701 | 2702 | 50 | 26 | 55 | 94305 | 1 | 1.6 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1179 | 1180 | 36 | 11 | 98 | 90291 | 3 | 1.2 | 3 | 0 | 0 | 1 | 0 | 0 | 1 |
| 932 | 933 | 51 | 27 | 112 | 94720 | 3 | 1.8 | 2 | 0 | 0 | 1 | 1 | 1 | 1 |
| 792 | 793 | 41 | 16 | 98 | 93117 | 1 | 4.0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
# Display first 5 rows
df.head()  # default n=5
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
# Display last 5 rows
df.tail()  # default n=5
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4995 | 4996 | 29 | 3 | 40 | 92697 | 1 | 1.9 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 4997 | 30 | 4 | 15 | 92037 | 4 | 0.4 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 4998 | 63 | 39 | 24 | 93023 | 2 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 4999 | 65 | 40 | 49 | 90034 | 3 | 0.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 5000 | 28 | 4 | 83 | 92612 | 3 | 0.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
Observations:
# check summary of dataset; include='all' also summarizes any non-numeric columns
df.describe(include='all').T # transpose for better visibility
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.0 | 2500.500000 | 1443.520003 | 1.0 | 1250.75 | 2500.5 | 3750.25 | 5000.0 |
| Age | 5000.0 | 45.338400 | 11.463166 | 23.0 | 35.00 | 45.0 | 55.00 | 67.0 |
| Experience | 5000.0 | 20.104600 | 11.467954 | -3.0 | 10.00 | 20.0 | 30.00 | 43.0 |
| Income | 5000.0 | 73.774200 | 46.033729 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIPCode | 5000.0 | 93169.257000 | 1759.455086 | 90005.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 5000.0 | 2.396400 | 1.147663 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.0 | 0.70 | 1.5 | 2.50 | 10.0 |
| Education | 5000.0 | 1.881000 | 0.839869 | 1.0 | 1.00 | 2.0 | 3.00 | 3.0 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
| Personal_Loan | 5000.0 | 0.096000 | 0.294621 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Securities_Account | 5000.0 | 0.104400 | 0.305809 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| CD_Account | 5000.0 | 0.060400 | 0.238250 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Online | 5000.0 | 0.596800 | 0.490589 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
| CreditCard | 5000.0 | 0.294000 | 0.455637 | 0.0 | 0.00 | 0.0 | 1.00 | 1.0 |
Observations:
# check the distinct value count
df["ZIPCode"].value_counts()  # frequency of each zipcode, most common first
94720 169
94305 127
95616 116
90095 71
93106 57
...
96145 1
94087 1
91024 1
93077 1
94598 1
Name: ZIPCode, Length: 467, dtype: int64
# county column added to dataset using zip_county function
# NOTE(review): .apply runs a lookup per row (and the helper builds a new SearchEngine
# each call) — consider mapping the unique zipcodes once if this is slow
df["County"] = df["ZIPCode"].apply(zip_county)
# State column added to dataset using zip_state function
df["State"] = df["ZIPCode"].apply(zip_state)
# check the first 5 rows
df.head()
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | County | State | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | Los Angeles County | CA |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | Los Angeles County | CA |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Alameda County | CA |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | San Francisco County | CA |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | Los Angeles County | CA |
Observations:
County and State columns are now added to the dataframe
# distinct value counts; note value_counts excludes the None entries from unmapped zipcodes
df["State"].value_counts()
CA 4966 Name: State, dtype: int64
Observations:
# check the count of null values and see the data with missing data in State column
print(df.State.isnull().sum())
df[df.State.isnull()]  # rows whose zipcode was not found by the uszipcode lookup
34
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | County | State | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 106 | 107 | 43 | 17 | 69 | 92717 | 4 | 2.90 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | None | None |
| 172 | 173 | 38 | 13 | 171 | 92717 | 2 | 7.80 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 184 | 185 | 52 | 26 | 63 | 92717 | 2 | 1.50 | 2 | 0 | 0 | 1 | 0 | 1 | 0 | None | None |
| 321 | 322 | 44 | 20 | 101 | 92717 | 3 | 4.40 | 2 | 82 | 1 | 0 | 0 | 0 | 0 | None | None |
| 366 | 367 | 50 | 24 | 35 | 92717 | 1 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 384 | 385 | 51 | 25 | 21 | 93077 | 4 | 0.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | None | None |
| 468 | 469 | 34 | 10 | 21 | 92634 | 1 | 0.50 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 476 | 477 | 60 | 34 | 53 | 92717 | 1 | 0.80 | 2 | 0 | 0 | 1 | 0 | 0 | 1 | None | None |
| 630 | 631 | 32 | 7 | 35 | 96651 | 3 | 1.30 | 1 | 108 | 0 | 0 | 0 | 0 | 1 | None | None |
| 672 | 673 | 51 | 27 | 23 | 96651 | 1 | 0.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 695 | 696 | 29 | 4 | 115 | 92717 | 1 | 1.90 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | None | None |
| 721 | 722 | 49 | 24 | 39 | 92717 | 1 | 1.40 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 780 | 781 | 32 | 7 | 42 | 92634 | 4 | 0.80 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | None | None |
| 1099 | 1100 | 30 | 6 | 52 | 92717 | 3 | 0.70 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 1189 | 1190 | 42 | 17 | 115 | 92717 | 2 | 0.40 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 1426 | 1427 | 37 | 11 | 60 | 96651 | 3 | 0.50 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 1483 | 1484 | 58 | 32 | 63 | 92717 | 1 | 1.60 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | None | None |
| 1653 | 1654 | 26 | 1 | 24 | 96651 | 2 | 0.90 | 3 | 123 | 0 | 0 | 0 | 0 | 1 | None | None |
| 1752 | 1753 | 33 | 8 | 155 | 92717 | 1 | 7.40 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | None | None |
| 1844 | 1845 | 65 | 40 | 21 | 92717 | 3 | 0.10 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | None | None |
| 2049 | 2050 | 43 | 18 | 94 | 92717 | 4 | 1.10 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 2211 | 2212 | 39 | 14 | 31 | 92717 | 2 | 1.40 | 2 | 94 | 0 | 0 | 0 | 1 | 1 | None | None |
| 2218 | 2219 | 38 | 13 | 9 | 92634 | 2 | 0.30 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | None | None |
| 2428 | 2429 | 39 | 12 | 108 | 92717 | 4 | 3.67 | 2 | 301 | 1 | 0 | 0 | 0 | 1 | None | None |
| 2486 | 2487 | 61 | 36 | 130 | 92717 | 1 | 1.30 | 1 | 257 | 0 | 0 | 0 | 0 | 0 | None | None |
| 2731 | 2732 | 29 | 5 | 28 | 96651 | 1 | 0.20 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 2957 | 2958 | 61 | 36 | 53 | 92717 | 3 | 0.50 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | None | None |
| 3525 | 3526 | 59 | 34 | 13 | 96651 | 4 | 0.90 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | None | None |
| 3887 | 3888 | 24 | -2 | 118 | 92634 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | None | None |
| 4090 | 4091 | 42 | 18 | 49 | 92717 | 3 | 2.10 | 3 | 0 | 0 | 1 | 0 | 1 | 0 | None | None |
| 4276 | 4277 | 50 | 24 | 155 | 92717 | 1 | 7.30 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | None | None |
| 4321 | 4322 | 27 | 0 | 34 | 92717 | 1 | 2.00 | 2 | 112 | 0 | 0 | 0 | 0 | 1 | None | None |
| 4384 | 4385 | 45 | 20 | 61 | 92717 | 3 | 2.70 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | None | None |
| 4392 | 4393 | 52 | 27 | 81 | 92634 | 4 | 3.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | None | None |
Observations:
# update data for County and State based on ZIPCodes
# NOTE(review): manual fixes for zipcodes the uszipcode lookup could not resolve;
# some of the "County" values below look like city names (e.g. Irvine, Fullerton) — confirm intended
df.loc[df['ZIPCode'] == 92717,["County","State"]] = ["Irvine","CA"]
df.loc[df['ZIPCode'] == 93077,["County","State"]] = ["Ventura","CA"]
df.loc[df['ZIPCode'] == 92634,["County","State"]] = ["Fullerton","CA"]
df.loc[df['ZIPCode'] == 96651,["County","State"]] = ["San Francisco","CA"]
# query the data for the ZIPCode that were updated
df.loc[df['ZIPCode'].isin([92717,93077,92634,96651])]
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | County | State | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 106 | 107 | 43 | 17 | 69 | 92717 | 4 | 2.90 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | Irvine | CA |
| 172 | 173 | 38 | 13 | 171 | 92717 | 2 | 7.80 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | CA |
| 184 | 185 | 52 | 26 | 63 | 92717 | 2 | 1.50 | 2 | 0 | 0 | 1 | 0 | 1 | 0 | Irvine | CA |
| 321 | 322 | 44 | 20 | 101 | 92717 | 3 | 4.40 | 2 | 82 | 1 | 0 | 0 | 0 | 0 | Irvine | CA |
| 366 | 367 | 50 | 24 | 35 | 92717 | 1 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | CA |
| 384 | 385 | 51 | 25 | 21 | 93077 | 4 | 0.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | Ventura | CA |
| 468 | 469 | 34 | 10 | 21 | 92634 | 1 | 0.50 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Fullerton | CA |
| 476 | 477 | 60 | 34 | 53 | 92717 | 1 | 0.80 | 2 | 0 | 0 | 1 | 0 | 0 | 1 | Irvine | CA |
| 630 | 631 | 32 | 7 | 35 | 96651 | 3 | 1.30 | 1 | 108 | 0 | 0 | 0 | 0 | 1 | San Francisco | CA |
| 672 | 673 | 51 | 27 | 23 | 96651 | 1 | 0.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | San Francisco | CA |
| 695 | 696 | 29 | 4 | 115 | 92717 | 1 | 1.90 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Irvine | CA |
| 721 | 722 | 49 | 24 | 39 | 92717 | 1 | 1.40 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | CA |
| 780 | 781 | 32 | 7 | 42 | 92634 | 4 | 0.80 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | Fullerton | CA |
| 1099 | 1100 | 30 | 6 | 52 | 92717 | 3 | 0.70 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | CA |
| 1189 | 1190 | 42 | 17 | 115 | 92717 | 2 | 0.40 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | CA |
| 1426 | 1427 | 37 | 11 | 60 | 96651 | 3 | 0.50 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | San Francisco | CA |
| 1483 | 1484 | 58 | 32 | 63 | 92717 | 1 | 1.60 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | Irvine | CA |
| 1653 | 1654 | 26 | 1 | 24 | 96651 | 2 | 0.90 | 3 | 123 | 0 | 0 | 0 | 0 | 1 | San Francisco | CA |
| 1752 | 1753 | 33 | 8 | 155 | 92717 | 1 | 7.40 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | Irvine | CA |
| 1844 | 1845 | 65 | 40 | 21 | 92717 | 3 | 0.10 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | Irvine | CA |
| 2049 | 2050 | 43 | 18 | 94 | 92717 | 4 | 1.10 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | CA |
| 2211 | 2212 | 39 | 14 | 31 | 92717 | 2 | 1.40 | 2 | 94 | 0 | 0 | 0 | 1 | 1 | Irvine | CA |
| 2218 | 2219 | 38 | 13 | 9 | 92634 | 2 | 0.30 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Fullerton | CA |
| 2428 | 2429 | 39 | 12 | 108 | 92717 | 4 | 3.67 | 2 | 301 | 1 | 0 | 0 | 0 | 1 | Irvine | CA |
| 2486 | 2487 | 61 | 36 | 130 | 92717 | 1 | 1.30 | 1 | 257 | 0 | 0 | 0 | 0 | 0 | Irvine | CA |
| 2731 | 2732 | 29 | 5 | 28 | 96651 | 1 | 0.20 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | San Francisco | CA |
| 2957 | 2958 | 61 | 36 | 53 | 92717 | 3 | 0.50 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | CA |
| 3525 | 3526 | 59 | 34 | 13 | 96651 | 4 | 0.90 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | San Francisco | CA |
| 3887 | 3888 | 24 | -2 | 118 | 92634 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | Fullerton | CA |
| 4090 | 4091 | 42 | 18 | 49 | 92717 | 3 | 2.10 | 3 | 0 | 0 | 1 | 0 | 1 | 0 | Irvine | CA |
| 4276 | 4277 | 50 | 24 | 155 | 92717 | 1 | 7.30 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | Irvine | CA |
| 4321 | 4322 | 27 | 0 | 34 | 92717 | 1 | 2.00 | 2 | 112 | 0 | 0 | 0 | 0 | 1 | Irvine | CA |
| 4384 | 4385 | 45 | 20 | 61 | 92717 | 3 | 2.70 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | Irvine | CA |
| 4392 | 4393 | 52 | 27 | 81 | 92634 | 4 | 3.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Fullerton | CA |
Observations:
# check the count of distinct negative values in Experience
# (negative work experience is impossible, so these are data-quality issues)
df[df['Experience'] < 0]['Experience'].value_counts()
-1 33 -2 15 -3 4 Name: Experience, dtype: int64
Observations:
# filter rows with negative experience
df[df['Experience'] < 0]  # inspect the affected rows before treating them
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | County | State | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 89 | 90 | 25 | -1 | 113 | 94303 | 4 | 2.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | San Mateo County | CA |
| 226 | 227 | 24 | -1 | 39 | 94085 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Santa Clara County | CA |
| 315 | 316 | 24 | -2 | 51 | 90630 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Orange County | CA |
| 451 | 452 | 28 | -2 | 48 | 94132 | 2 | 1.75 | 3 | 89 | 0 | 0 | 0 | 1 | 0 | San Francisco County | CA |
| 524 | 525 | 24 | -1 | 75 | 93014 | 4 | 0.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Santa Barbara County | CA |
| 536 | 537 | 25 | -1 | 43 | 92173 | 3 | 2.40 | 2 | 176 | 0 | 0 | 0 | 1 | 0 | San Diego County | CA |
| 540 | 541 | 25 | -1 | 109 | 94010 | 4 | 2.30 | 3 | 314 | 0 | 0 | 0 | 1 | 0 | San Mateo County | CA |
| 576 | 577 | 25 | -1 | 48 | 92870 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | Orange County | CA |
| 583 | 584 | 24 | -1 | 38 | 95045 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | San Benito County | CA |
| 597 | 598 | 24 | -2 | 125 | 92835 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | Orange County | CA |
| 649 | 650 | 25 | -1 | 82 | 92677 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Orange County | CA |
| 670 | 671 | 23 | -1 | 61 | 92374 | 4 | 2.60 | 1 | 239 | 0 | 0 | 0 | 1 | 0 | San Bernardino County | CA |
| 686 | 687 | 24 | -1 | 38 | 92612 | 4 | 0.60 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | Orange County | CA |
| 793 | 794 | 24 | -2 | 150 | 94720 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Alameda County | CA |
| 889 | 890 | 24 | -2 | 82 | 91103 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | Los Angeles County | CA |
| 909 | 910 | 23 | -1 | 149 | 91709 | 1 | 6.33 | 1 | 305 | 0 | 0 | 0 | 0 | 1 | San Bernardino County | CA |
| 1173 | 1174 | 24 | -1 | 35 | 94305 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Santa Clara County | CA |
| 1428 | 1429 | 25 | -1 | 21 | 94583 | 4 | 0.40 | 1 | 90 | 0 | 0 | 0 | 1 | 0 | Contra Costa County | CA |
| 1522 | 1523 | 25 | -1 | 101 | 94720 | 4 | 2.30 | 3 | 256 | 0 | 0 | 0 | 0 | 1 | Alameda County | CA |
| 1905 | 1906 | 25 | -1 | 112 | 92507 | 2 | 2.00 | 1 | 241 | 0 | 0 | 0 | 1 | 0 | Riverside County | CA |
| 2102 | 2103 | 25 | -1 | 81 | 92647 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | Orange County | CA |
| 2430 | 2431 | 23 | -1 | 73 | 92120 | 4 | 2.60 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | San Diego County | CA |
| 2466 | 2467 | 24 | -2 | 80 | 94105 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | San Francisco County | CA |
| 2545 | 2546 | 25 | -1 | 39 | 94720 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | Alameda County | CA |
| 2618 | 2619 | 23 | -3 | 55 | 92704 | 3 | 2.40 | 2 | 145 | 0 | 0 | 0 | 1 | 0 | Orange County | CA |
| 2717 | 2718 | 23 | -2 | 45 | 95422 | 4 | 0.60 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | Lake County | CA |
| 2848 | 2849 | 24 | -1 | 78 | 94720 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Alameda County | CA |
| 2876 | 2877 | 24 | -2 | 80 | 91107 | 2 | 1.60 | 3 | 238 | 0 | 0 | 0 | 0 | 0 | Los Angeles County | CA |
| 2962 | 2963 | 23 | -2 | 81 | 91711 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Los Angeles County | CA |
| 2980 | 2981 | 25 | -1 | 53 | 94305 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Santa Clara County | CA |
| 3076 | 3077 | 29 | -1 | 62 | 92672 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | Orange County | CA |
| 3130 | 3131 | 23 | -2 | 82 | 92152 | 2 | 1.80 | 2 | 0 | 0 | 1 | 0 | 0 | 1 | San Diego County | CA |
| 3157 | 3158 | 23 | -1 | 13 | 94720 | 4 | 1.00 | 1 | 84 | 0 | 0 | 0 | 1 | 0 | Alameda County | CA |
| 3279 | 3280 | 26 | -1 | 44 | 94901 | 1 | 2.00 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Marin County | CA |
| 3284 | 3285 | 25 | -1 | 101 | 95819 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | Sacramento County | CA |
| 3292 | 3293 | 25 | -1 | 13 | 95616 | 4 | 0.40 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | Yolo County | CA |
| 3394 | 3395 | 25 | -1 | 113 | 90089 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Los Angeles County | CA |
| 3425 | 3426 | 23 | -1 | 12 | 91605 | 4 | 1.00 | 1 | 90 | 0 | 0 | 0 | 1 | 0 | Los Angeles County | CA |
| 3626 | 3627 | 24 | -3 | 28 | 90089 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | Los Angeles County | CA |
| 3796 | 3797 | 24 | -2 | 50 | 94920 | 3 | 2.40 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | Marin County | CA |
| 3824 | 3825 | 23 | -1 | 12 | 95064 | 4 | 1.00 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | Santa Cruz County | CA |
| 3887 | 3888 | 24 | -2 | 118 | 92634 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | Fullerton | CA |
| 3946 | 3947 | 25 | -1 | 40 | 93117 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | Santa Barbara County | CA |
| 4015 | 4016 | 25 | -1 | 139 | 93106 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | Santa Barbara County | CA |
| 4088 | 4089 | 29 | -1 | 71 | 94801 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | Contra Costa County | CA |
| 4116 | 4117 | 24 | -2 | 135 | 90065 | 2 | 7.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Los Angeles County | CA |
| 4285 | 4286 | 23 | -3 | 149 | 93555 | 2 | 7.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | Kern County | CA |
| 4411 | 4412 | 23 | -2 | 75 | 90291 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | Los Angeles County | CA |
| 4481 | 4482 | 25 | -2 | 35 | 95045 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | San Benito County | CA |
| 4514 | 4515 | 24 | -3 | 41 | 91768 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Los Angeles County | CA |
| 4582 | 4583 | 25 | -1 | 69 | 92691 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Orange County | CA |
| 4957 | 4958 | 29 | -1 | 50 | 95842 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | Sacramento County | CA |
Observations:
# check the correlation of the dataset
df.corr()  # pairwise correlation of the numeric columns
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | 1.000000 | -0.008473 | -0.008326 | -0.017695 | 0.002240 | -0.016797 | -0.024675 | 0.021463 | -0.013920 | -0.024801 | -0.016972 | -0.006909 | -0.002528 | 0.017028 |
| Age | -0.008473 | 1.000000 | 0.994215 | -0.055269 | -0.030530 | -0.046418 | -0.052012 | 0.041334 | -0.012539 | -0.007726 | -0.000436 | 0.008043 | 0.013702 | 0.007681 |
| Experience | -0.008326 | 0.994215 | 1.000000 | -0.046574 | -0.030456 | -0.052563 | -0.050077 | 0.013152 | -0.010582 | -0.007413 | -0.001232 | 0.010353 | 0.013898 | 0.008967 |
| Income | -0.017695 | -0.055269 | -0.046574 | 1.000000 | -0.030709 | -0.157501 | 0.645984 | -0.187524 | 0.206806 | 0.502462 | -0.002616 | 0.169738 | 0.014206 | -0.002385 |
| ZIPCode | 0.002240 | -0.030530 | -0.030456 | -0.030709 | 1.000000 | 0.027512 | -0.012188 | -0.008266 | 0.003614 | -0.002974 | 0.002422 | 0.021671 | 0.028317 | 0.024033 |
| Family | -0.016797 | -0.046418 | -0.052563 | -0.157501 | 0.027512 | 1.000000 | -0.109275 | 0.064929 | -0.020445 | 0.061367 | 0.019994 | 0.014110 | 0.010354 | 0.011588 |
| CCAvg | -0.024675 | -0.052012 | -0.050077 | 0.645984 | -0.012188 | -0.109275 | 1.000000 | -0.136124 | 0.109905 | 0.366889 | 0.015086 | 0.136534 | -0.003611 | -0.006689 |
| Education | 0.021463 | 0.041334 | 0.013152 | -0.187524 | -0.008266 | 0.064929 | -0.136124 | 1.000000 | -0.033327 | 0.136722 | -0.010812 | 0.013934 | -0.015004 | -0.011014 |
| Mortgage | -0.013920 | -0.012539 | -0.010582 | 0.206806 | 0.003614 | -0.020445 | 0.109905 | -0.033327 | 1.000000 | 0.142095 | -0.005411 | 0.089311 | -0.005995 | -0.007231 |
| Personal_Loan | -0.024801 | -0.007726 | -0.007413 | 0.502462 | -0.002974 | 0.061367 | 0.366889 | 0.136722 | 0.142095 | 1.000000 | 0.021954 | 0.316355 | 0.006278 | 0.002802 |
| Securities_Account | -0.016972 | -0.000436 | -0.001232 | -0.002616 | 0.002422 | 0.019994 | 0.015086 | -0.010812 | -0.005411 | 0.021954 | 1.000000 | 0.317034 | 0.012627 | -0.015028 |
| CD_Account | -0.006909 | 0.008043 | 0.010353 | 0.169738 | 0.021671 | 0.014110 | 0.136534 | 0.013934 | 0.089311 | 0.316355 | 0.317034 | 1.000000 | 0.175880 | 0.278644 |
| Online | -0.002528 | 0.013702 | 0.013898 | 0.014206 | 0.028317 | 0.010354 | -0.003611 | -0.015004 | -0.005995 | 0.006278 | 0.012627 | 0.175880 | 1.000000 | 0.004210 |
| CreditCard | 0.017028 | 0.007681 | 0.008967 | -0.002385 | 0.024033 | 0.011588 | -0.006689 | -0.011014 | -0.007231 | 0.002802 | -0.015028 | 0.278644 | 0.004210 | 1.000000 |
Observations:
# Create two different dataframes with records where experience value is greater than 0 and lesser than 0 respectively
# Get the list of Customer IDs from the dataframe containing records with negative experience values
# Iterating over the Customer ID list
# Get the Age and Education level for the corresponding ID from the negative experience dataframe
# Filter the records form the positive experience dataframe based on the obtained age and education value
# calculate the median experience value from the filtered dataframe and store in "experience" variable
# if the filtered dataframe is empty, filter the negative experience dataframe and obtain the median experience value
# Replace the negative experience with the absolute value of the median "experience" value
# Impute negative Experience values with the median experience of customers
# who share the same Age and Education level.
# Strategy:
#   1. Split records into positive- and negative-experience frames.
#   2. For each customer with negative experience, take the median experience
#      of positive-experience peers with the same Age and Education.
#   3. If no positive-experience peers exist, fall back to the median of the
#      negative-experience peers.
#   4. Replace the negative value with the absolute value of that median.
df_positive_experience = df[df["Experience"] > 0]
df_negative_experience = df[df["Experience"] < 0]

# NOTE: `cust_id` avoids shadowing the builtin `id`; boolean masks replace the
# previous df.loc[np.where(...)] indirection.
for cust_id in df_negative_experience["ID"].tolist():
    # Age and Education of the customer being imputed
    customer_row = df.loc[df["ID"] == cust_id].iloc[0]
    age = customer_row["Age"]
    education = customer_row["Education"]

    # Peers with the same demographic profile and valid (positive) experience
    peers = df_positive_experience[
        (df_positive_experience["Age"] == age)
        & (df_positive_experience["Education"] == education)
    ]
    if peers.empty:
        # No positive-experience peers: fall back to negative-experience peers
        peers = df_negative_experience[
            (df_negative_experience["Age"] == age)
            & (df_negative_experience["Education"] == education)
        ]
    experience = round(peers["Experience"].median())

    # abs() guards against a negative median coming from the fallback branch
    df.loc[df.ID == cust_id, "Experience"] = abs(experience)

# sanity check: no negative experience values should remain
df[df["Experience"] < 0]["Experience"].count()
0
Observations:
There are no records with negative experience
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.0 | 2500.500000 | 1443.520003 | 1.0 | 1250.75 | 2500.5 | 3750.25 | 5000.0 |
| Age | 5000.0 | 45.338400 | 11.463166 | 23.0 | 35.00 | 45.0 | 55.00 | 67.0 |
| Experience | 5000.0 | 20.134800 | 11.414863 | 0.0 | 10.00 | 20.0 | 30.00 | 43.0 |
| Income | 5000.0 | 73.774200 | 46.033729 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIPCode | 5000.0 | 93169.257000 | 1759.455086 | 90005.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 5000.0 | 2.396400 | 1.147663 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.0 | 0.70 | 1.5 | 2.50 | 10.0 |
| Education | 5000.0 | 1.881000 | 0.839869 | 1.0 | 1.00 | 2.0 | 3.00 | 3.0 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
| Personal_Loan | 5000.0 | 0.096000 | 0.294621 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Securities_Account | 5000.0 | 0.104400 | 0.305809 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| CD_Account | 5000.0 | 0.060400 | 0.238250 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Online | 5000.0 | 0.596800 | 0.490589 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
| CreditCard | 5000.0 | 0.294000 | 0.455637 | 0.0 | 0.00 | 0.0 | 1.00 | 1.0 |
Observations:
# histogram and boxplot for ID
histogram_boxplot(df, "ID")
Observations:
# histogram, barplot and boxplot for Age
histogram_boxplot(df, "Age")
Observations:
# histogram, boxplot and barplot for Experience
histogram_boxplot(df, "Experience")
Observations:
# histogram and boxplot for Income
histogram_boxplot(df, "Income")
Observations:
# histogram and boxplot for ZIPCode
histogram_boxplot(df, "ZIPCode")
Observations:
# barplot for State
labeled_barplot(df,"State",True)
Observations:
# barplot for County
labeled_barplot(df,"County",True)
Observations:
# barplot for Family
labeled_barplot(df,"Family",True)
Observations:
# histogram and boxplot for CCAvg
histogram_boxplot(df, "CCAvg")
Observations:
# barplot for Education
labeled_barplot(df,"Education",True)
Observations:
# histogram and boxplot for Mortgage
histogram_boxplot(df, "Mortgage")
Observations:
# barplot for Personal_Loan
labeled_barplot(df,"Personal_Loan",True)
Observations:
# barplot for Securities_Account
labeled_barplot(df,"Securities_Account",True)
Observations:
# barplot for CD_Account
labeled_barplot(df,"CD_Account",True)
Observations:
# barplot for Online
labeled_barplot(df,"Online",True)
Observations:
# barplot for CreditCard
labeled_barplot(df,"CreditCard",True)
Observations:
# stacked barplot of Personal_Loan in comparison to Credit Card
stacked_barplot(df, 'CreditCard', 'Personal_Loan')
Observations:
There is a marginal difference in the percentage of customers with a credit card applying for a personal loan, compared to customers without a credit card.
# stacked barplot of Personal_Loan in comparison to Age
stacked_barplot(df, 'Age', 'Personal_Loan')
Observations:
# stacked barplot of Personal_Loan in comparison to Experience
stacked_barplot(df, 'Experience', 'Personal_Loan')
Observations:
# stacked barplot of Personal_Loan in comparison to County
stacked_barplot(df, 'County', 'Personal_Loan')
Observations:
# stacked barplot of Personal_Loan in comparison to Family
stacked_barplot(df, 'Family', 'Personal_Loan')
Observations:
Customers with family of 3 contribute to slightly higher percentage compared to others.
# stacked barplot of Personal_Loan in comparison to Education
stacked_barplot(df, 'Education', 'Personal_Loan')
Observations:
# stacked barplot of Personal_Loan in comparison to Securities Account
stacked_barplot(df, 'Securities_Account', 'Personal_Loan')
Observations:
Securities Account has slight impact on the probability of customers applying for personal loan
# stacked barplot of Personal_Loan in comparison to CD Account
stacked_barplot(df, 'CD_Account', 'Personal_Loan')
Observations:
Customer with CD Accounts have shown more interest in Personal Loan.
# stacked barplot of Personal_Loan based on Online banking usage
stacked_barplot(df, 'Online', 'Personal_Loan')
Observations:
Customer using Online Banking feature has a very little impact on the personal loan interest.
# heatmap for correlation
plt.figure(figsize=(15, 7))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
Observations:
# boxplot of Education and Income with Personal_Loan hue
plt.figure(figsize=(10,5))
sns.boxplot(x='Education', y='Income', hue='Personal_Loan', data=df)
plt.show()
Observations:
# boxplot of Education and Mortgage with Personal_Loan hue
plt.figure(figsize=(10,5))
sns.boxplot(x='Education', y='Mortgage', hue='Personal_Loan', data=df)
plt.show()
Observation:
Customers of any education level but with high value of house mortgage, have higher chances of taking a loan compared to those of the same education level but with low house mortgage value.
# boxplot of CreditCard and CCAVG with Personal_Loan hue
plt.figure(figsize=(10,5))
sns.boxplot(x="CreditCard", y='CCAvg', hue="Personal_Loan", data=df)
plt.show()
Observation:
Customers who have credit card and have higher monthly average spend are more likely to avail Personal_loan.
# joint plot of Age vs Experience, coloured by Personal_Loan.
# NOTE: sns.jointplot is a figure-level function that creates its own figure,
# so the previous plt.figure(figsize=(10, 5)) call only produced an empty
# stray figure (visible as "<Figure size 1000x500 with 0 Axes>" in the output)
# and has been removed.
sns.jointplot(x='Age', y='Experience', hue='Personal_Loan', data=df)
plt.show()
<Figure size 1000x500 with 0 Axes>
Observations:
We have seen during Univariate Analysis that Mortgage, Income and CCAvg has outliers. We will try to treat the outliers using the functions defined and see if that reduces the skewness.
Tried the model building without treating the outliers, the performance is better when outliers are treated.
# Columns that showed outliers during univariate analysis
numerical_col = ["Mortgage", "Income", "CCAvg"]


def _boxplot_columns(frame, columns):
    """Draw a boxplot for each column in `columns` on a single large figure.

    Extracted because the identical plotting loop was needed both before and
    after outlier treatment.
    """
    plt.figure(figsize=(20, 30))
    for i, variable in enumerate(columns):
        plt.subplot(5, 4, i + 1)
        plt.boxplot(frame[variable], whis=1.5)
        plt.tight_layout()
        plt.title(variable)
    plt.show()


# distribution before treating the outliers
_boxplot_columns(df, numerical_col)

# cap/floor the outliers using the helper defined earlier in the notebook
df = treat_outliers_all(df, numerical_col)

# verify the distributions after outlier treatment
_boxplot_columns(df, numerical_col)
df.head()
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | County | State | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49.0 | 91107 | 4 | 1.6 | 1 | 0.0 | 0 | 1 | 0 | 0 | 0 | Los Angeles County | CA |
| 1 | 2 | 45 | 19 | 34.0 | 90089 | 3 | 1.5 | 1 | 0.0 | 0 | 1 | 0 | 0 | 0 | Los Angeles County | CA |
| 2 | 3 | 39 | 15 | 11.0 | 94720 | 1 | 1.0 | 1 | 0.0 | 0 | 0 | 0 | 0 | 0 | Alameda County | CA |
| 3 | 4 | 35 | 9 | 100.0 | 94112 | 1 | 2.7 | 2 | 0.0 | 0 | 0 | 0 | 0 | 0 | San Francisco County | CA |
| 4 | 5 | 35 | 8 | 45.0 | 91330 | 4 | 1.0 | 2 | 0.0 | 0 | 0 | 0 | 0 | 1 | Los Angeles County | CA |
Both the cases are important as:
If we predict that Customer would accept personal loan but the Customer rejects the offer, will cost the Banking the marketing effort and incidental costs associated with it.
Since the large part of dataset shows that Customers does not accept the Personal Loan offer the Prediction will also be that most of the Customers would not accept the Personal Loan offer. However going by that principle Bank should not stop marketing the product to such customers. As there would certain cases where Customer may not have availed the Personal Loan in past but in future the requirement may change.
F1 score should be maximized: the greater the F1 score, the higher the chances of identifying both classes correctly.
# defining the dependent and independent variables
# Target variable and predictor matrix.
# ID is an identifier, Personal_Loan is the target, and ZIPCode/State are
# dropped (County is kept as the location feature).
y = df["Personal_Loan"]
X = df.drop(columns=["ID", "Personal_Loan", "ZIPCode", "State"])

# One-hot encode every non-numeric column, dropping the first level of each
# category to avoid perfect multicollinearity.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# preview the encoded feature matrix
X.head(10)
| Age | Experience | Income | Family | CCAvg | Education | Mortgage | Securities_Account | CD_Account | Online | CreditCard | County_Butte County | County_Contra Costa County | County_El Dorado County | County_Fresno County | County_Fullerton | County_Humboldt County | County_Imperial County | County_Irvine | County_Kern County | County_Lake County | County_Los Angeles County | County_Marin County | County_Mendocino County | County_Merced County | County_Monterey County | County_Napa County | County_Orange County | County_Placer County | County_Riverside County | County_Sacramento County | County_San Benito County | County_San Bernardino County | County_San Diego County | County_San Francisco | County_San Francisco County | County_San Joaquin County | County_San Luis Obispo County | County_San Mateo County | County_Santa Barbara County | County_Santa Clara County | County_Santa Cruz County | County_Shasta County | County_Siskiyou County | County_Solano County | County_Sonoma County | County_Stanislaus County | County_Trinity County | County_Tuolumne County | County_Ventura | County_Ventura County | County_Yolo County | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 49.0 | 4 | 1.6 | 1 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 45 | 19 | 34.0 | 3 | 1.5 | 1 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 39 | 15 | 11.0 | 1 | 1.0 | 1 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 35 | 9 | 100.0 | 1 | 2.7 | 2 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 35 | 8 | 45.0 | 4 | 1.0 | 2 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 37 | 13 | 29.0 | 4 | 0.4 | 2 | 155.0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | 53 | 27 | 72.0 | 2 | 1.5 | 2 | 0.0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7 | 50 | 24 | 22.0 | 1 | 0.3 | 3 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 8 | 35 | 10 | 81.0 | 3 | 0.6 | 2 | 104.0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9 | 34 | 9 | 180.0 | 1 | 5.2 | 3 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
# Hold out 30% of the observations for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 3500 Number of rows in test data = 1500
# Class balance of the target in the full data, the training split and the
# test split — confirms the split preserved the ~9.6% positive rate.
n_total = len(df.index)
n_true = len(df.loc[df["Personal_Loan"] == 1])
n_false = len(df.loc[df["Personal_Loan"] == 0])
print(f"Original Personal_Loan True Values : {n_true} ({n_true / n_total * 100:0.2f}%)")
print(f"Original Personal_Loan False Values : {n_false} ({n_false / n_total * 100:0.2f}%)")
print("")

n_train = len(y_train)
n_train_true = len(y_train[y_train == 1])
n_train_false = len(y_train[y_train == 0])
print(f"Training Personal_Loan True Values : {n_train_true} ({n_train_true / n_train * 100:0.2f}%)")
print(f"Training Personal_Loan False Values : {n_train_false} ({n_train_false / n_train * 100:0.2f}%)")
print("")

n_test = len(y_test)
n_test_true = len(y_test[y_test == 1])
n_test_false = len(y_test[y_test == 0])
print(f"Test Personal_Loan True Values : {n_test_true} ({n_test_true / n_test * 100:0.2f}%)")
print(f"Test Personal_Loan False Values : {n_test_false} ({n_test_false / n_test * 100:0.2f}%)")
print("")
Original Personal_Loan True Values : 480 (9.60%) Original Personal_Loan False Values : 4520 (90.40%) Training Personal_Loan True Values : 331 (9.46%) Training Personal_Loan False Values : 3169 (90.54%) Test Personal_Loan True Values : 149 (9.93%) Test Personal_Loan False Values : 1351 (90.07%)
# Fit a logistic regression. Of the solvers available in sklearn's
# LogisticRegression, newton-cg is faster for high-dimensional data
# (here: the many one-hot County columns).
lg = LogisticRegression(solver="newton-cg", random_state=1)
model = lg.fit(X_train, y_train)

# Collect the learned coefficients and the intercept into a labelled frame
coef_values = np.append(lg.coef_, lg.intercept_)
coef_labels = X_train.columns.tolist() + ["Intercept"]
coef_df = pd.DataFrame(coef_values, index=coef_labels, columns=["Coefficients"])
coef_df.T
| Age | Experience | Income | Family | CCAvg | Education | Mortgage | Securities_Account | CD_Account | Online | CreditCard | County_Butte County | County_Contra Costa County | County_El Dorado County | County_Fresno County | County_Fullerton | County_Humboldt County | County_Imperial County | County_Irvine | County_Kern County | County_Lake County | County_Los Angeles County | County_Marin County | County_Mendocino County | County_Merced County | County_Monterey County | County_Napa County | County_Orange County | County_Placer County | County_Riverside County | County_Sacramento County | County_San Benito County | County_San Bernardino County | County_San Diego County | County_San Francisco | County_San Francisco County | County_San Joaquin County | County_San Luis Obispo County | County_San Mateo County | County_Santa Barbara County | County_Santa Clara County | County_Santa Cruz County | County_Shasta County | County_Siskiyou County | County_Solano County | County_Sonoma County | County_Stanislaus County | County_Trinity County | County_Tuolumne County | County_Ventura | County_Ventura County | County_Yolo County | Intercept | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Coefficients | -0.019376 | 0.027861 | 0.055192 | 0.772491 | 0.395431 | 1.759719 | 0.001153 | -0.829597 | 3.221198 | -0.579582 | -0.985863 | -0.24135 | 0.688158 | -0.137179 | -0.005913 | -0.015475 | -0.310083 | -0.013803 | 0.607168 | 0.56593 | -0.00417 | 0.03994 | 0.440216 | -0.036756 | -0.201645 | -0.032475 | -0.004726 | -0.184521 | 0.692513 | 0.994989 | 0.112618 | -0.223128 | -0.791151 | 0.070062 | -0.000654 | 0.321675 | 0.014135 | -0.428411 | -1.010522 | 0.142678 | 0.21355 | 0.159954 | -0.17004 | -0.021965 | 0.239287 | 0.487245 | -0.256819 | -0.091925 | -0.13747 | -0.001773 | 0.187596 | -0.442845 | -14.239595 |
Observations:
# Convert the log-odds coefficients into odds ratios, and into the
# percentage change in odds per unit increase of each feature.
odds = np.exp(lg.coef_[0])
perc_change_odds = (odds - 1) * 100  # reuse `odds` instead of re-exponentiating

# show every column of the wide result
pd.set_option("display.max_columns", None)
pd.DataFrame({"Odds": odds, "Change_odd%": perc_change_odds}, index=X_train.columns).T
| Age | Experience | Income | Family | CCAvg | Education | Mortgage | Securities_Account | CD_Account | Online | CreditCard | County_Butte County | County_Contra Costa County | County_El Dorado County | County_Fresno County | County_Fullerton | County_Humboldt County | County_Imperial County | County_Irvine | County_Kern County | County_Lake County | County_Los Angeles County | County_Marin County | County_Mendocino County | County_Merced County | County_Monterey County | County_Napa County | County_Orange County | County_Placer County | County_Riverside County | County_Sacramento County | County_San Benito County | County_San Bernardino County | County_San Diego County | County_San Francisco | County_San Francisco County | County_San Joaquin County | County_San Luis Obispo County | County_San Mateo County | County_Santa Barbara County | County_Santa Clara County | County_Santa Cruz County | County_Shasta County | County_Siskiyou County | County_Solano County | County_Sonoma County | County_Stanislaus County | County_Trinity County | County_Tuolumne County | County_Ventura | County_Ventura County | County_Yolo County | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Odds | 0.980811 | 1.028253 | 1.056744 | 2.165154 | 1.485024 | 5.810802 | 1.001153 | 0.436225 | 25.058111 | 0.560132 | 0.373117 | 0.785566 | 1.990046 | 0.871814 | 0.994104 | 0.984644 | 0.733386 | 0.986292 | 1.835228 | 1.761085 | 0.995838 | 1.040748 | 1.553043 | 0.963912 | 0.817385 | 0.968047 | 0.995286 | 0.831502 | 1.998732 | 2.704695 | 1.119205 | 0.800012 | 0.453323 | 1.072575 | 0.999346 | 1.379436 | 1.014235 | 0.651543 | 0.364029 | 1.153359 | 1.238066 | 1.173456 | 0.843631 | 0.978275 | 1.270343 | 1.627826 | 0.773508 | 0.912173 | 0.871561 | 0.998228 | 1.206346 | 0.642207 |
| Change_odd% | -1.918936 | 2.825296 | 5.674380 | 116.515381 | 48.502385 | 481.080188 | 0.115322 | -56.377515 | 2405.811113 | -43.986759 | -62.688309 | -21.443355 | 99.004568 | -12.818621 | -0.589557 | -1.535594 | -26.661388 | -1.370787 | 83.522754 | 76.108451 | -0.416168 | 4.074801 | 55.304259 | -3.608843 | -18.261495 | -3.195334 | -0.471444 | -16.849758 | 99.873248 | 170.469539 | 11.920478 | -19.998759 | -54.667707 | 7.257487 | -0.065428 | 37.943572 | 1.423526 | -34.845664 | -63.597100 | 15.335881 | 23.806581 | 17.345644 | -15.636926 | -2.172515 | 27.034285 | 62.782597 | -22.649162 | -8.782685 | -12.843933 | -0.177167 | 20.634639 | -35.779337 |
Observations:
# Confusion matrix for the training data at the default 0.5 threshold
confusion_matrix_sklearn_with_threshold(lg, X_train, y_train)
plt.show()

# Score the model on the training data using the helper defined earlier
log_reg_model_train_perf = model_performance_classification_sklearn_with_threshold(
    lg, X_train, y_train
)
print("Training performance:")
log_reg_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.960571 | 0.685801 | 0.869732 | 0.766892 |
# Confusion matrix for the held-out test data at the default 0.5 threshold
confusion_matrix_sklearn_with_threshold(lg, X_test, y_test)
plt.show()

# Score the model on the test data using the helper defined earlier
log_reg_model_test_perf = model_performance_classification_sklearn_with_threshold(
    lg, X_test, y_test
)

# Fixed label: this is test-set performance (the original printed
# "Training performance:" here, inconsistent with the other test-set cells).
print("Test set performance:")
log_reg_model_test_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.947333 | 0.590604 | 0.830189 | 0.690196 |
def _plot_roc(estimator, X, y):
    """Plot the ROC curve for `estimator` on (X, y) and return the AUC.

    Extracted helper: the train and test cells were byte-for-byte duplicates.
    Also computes predict_proba only once per call (the original called it
    twice per dataset).
    """
    scores = estimator.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, scores)
    fpr, tpr, thresholds = roc_curve(y, scores)
    plt.figure(figsize=(7, 5))
    plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % auc)
    plt.plot([0, 1], [0, 1], "r--")  # diagonal = random-chance baseline
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic")
    plt.legend(loc="lower right")
    plt.show()
    return auc


# ROC curve and AUC on the training data
logit_roc_auc_train = _plot_roc(lg, X_train, y_train)
# ROC curve and AUC on the test data
logit_roc_auc_test = _plot_roc(lg, X_test, y_test)
Observations:
Model is giving Generalized performance on both Training and Testing Sets
Try to improve F1 Score, by changing the model threshold using AUC-ROC Curve.
# Optimal threshold from the ROC curve: maximise Youden's J = tpr - fpr,
# i.e. the point with a high true-positive rate and a low false-positive rate.
fpr, tpr, thresholds = roc_curve(y_train, lg.predict_proba(X_train)[:, 1])
best_idx = (tpr - fpr).argmax()
optimal_threshold_auc_roc = thresholds[best_idx]
print(optimal_threshold_auc_roc)
0.11887112484837645
# confusion matrix on the training data at the ROC-optimal threshold
confusion_matrix_sklearn_with_threshold(
    lg, X_train, y_train, threshold=optimal_threshold_auc_roc
)
# training-set performance metrics at the ROC-optimal threshold
log_reg_model_train_perf_threshold_auc_roc = model_performance_classification_sklearn_with_threshold(
    lg, X_train, y_train, threshold=optimal_threshold_auc_roc
)
print("Training performance:")
log_reg_model_train_perf_threshold_auc_roc
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.911143 | 0.89426 | 0.517483 | 0.655592 |
# confusion matrix on the test data at the ROC-optimal threshold
confusion_matrix_sklearn_with_threshold(
    lg, X_test, y_test, threshold=optimal_threshold_auc_roc
)
# test-set performance metrics at the ROC-optimal threshold
log_reg_model_test_perf_threshold_auc_roc = model_performance_classification_sklearn_with_threshold(
    lg, X_test, y_test, threshold=optimal_threshold_auc_roc
)
print("Test set performance:")
log_reg_model_test_perf_threshold_auc_roc
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.910667 | 0.852349 | 0.531381 | 0.654639 |
Observations:
The recall of the model for both training and test set has improved but the F1 score has reduced.
# Plot precision and recall as functions of the decision threshold, to pick
# a threshold where the two are balanced.
y_scores = lg.predict_proba(X_train)[:, 1]
prec, rec, tre = precision_recall_curve(y_train, y_scores)
plot_prec_recall_vs_tresh(prec, rec, tre)
Observations:
At 0.35 threshold we get balance precision and recall
# decision threshold read off the precision-recall curve (where they balance)
optimal_threshold_curve = 0.35
# confusion matrix on the training data at the chosen threshold
confusion_matrix_sklearn_with_threshold(
    lg, X_train, y_train, threshold=optimal_threshold_curve
)
# training-set performance metrics at the chosen threshold
log_reg_model_train_perf_threshold_curve = model_performance_classification_sklearn_with_threshold(
    lg, X_train, y_train, threshold=optimal_threshold_curve
)
print("Training performance:")
log_reg_model_train_perf_threshold_curve
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.954286 | 0.755287 | 0.759878 | 0.757576 |
# confusion matrix on the test data at the chosen 0.35 threshold
confusion_matrix_sklearn_with_threshold(
    lg, X_test, y_test, threshold=optimal_threshold_curve
)
# test-set performance metrics at the chosen 0.35 threshold
log_reg_model_test_perf_threshold_curve = model_performance_classification_sklearn_with_threshold(
    lg, X_test, y_test, threshold=optimal_threshold_curve
)
print("Test set performance:")
log_reg_model_test_perf_threshold_curve
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.945333 | 0.671141 | 0.75188 | 0.70922 |
Observations:
F1 Score has improved a little
# Compare training performance of the three thresholding strategies side by side
models_train_comp_df = pd.concat(
    [
        log_reg_model_train_perf.T,
        log_reg_model_train_perf_threshold_auc_roc.T,
        log_reg_model_train_perf_threshold_curve.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Logistic Regression Default",
    "Logistic Regression ROC-AUC",
    # fixed misspelling in the displayed label: "PRECESSION" -> "PRECISION"
    "Logistic Regression RECALL and PRECISION VS Threshold",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Logistic Regression Default | Logistic Regression ROC-AUC | Logistic Regression RECALL and PRECESSION VS Threshold | |
|---|---|---|---|
| Accuracy | 0.960571 | 0.911143 | 0.954286 |
| Recall | 0.685801 | 0.894260 | 0.755287 |
| Precision | 0.869732 | 0.517483 | 0.759878 |
| F1 | 0.766892 | 0.655592 | 0.757576 |
# Compare test-set performance of the three thresholding strategies side by side
models_test_comp_df = pd.concat(
    [
        log_reg_model_test_perf.T,
        log_reg_model_test_perf_threshold_auc_roc.T,
        log_reg_model_test_perf_threshold_curve.T,
    ],
    axis=1,
)
models_test_comp_df.columns = [
    "Logistic Regression Default",
    "Logistic Regression ROC-AUC",
    # fixed misspelling in the displayed label: "PRECESSION" -> "PRECISION"
    "Logistic Regression RECALL and PRECISION VS Threshold",
]
print("Test set performance comparison:")
models_test_comp_df
Test set performance comparison:
| Logistic Regression Default | Logistic Regression ROC-AUC | Logistic Regression RECALL and PRECESSION VS Threshold | |
|---|---|---|---|
| Accuracy | 0.947333 | 0.910667 | 0.945333 |
| Recall | 0.590604 | 0.852349 | 0.671141 |
| Precision | 0.830189 | 0.531381 | 0.751880 |
| F1 | 0.690196 | 0.654639 | 0.709220 |
Observations:
Both the cases are important as:
If we predict that Customer would accept personal loan but the Customer rejects the offer, will cost the Banking the marketing effort and incidental costs associated with it.
Since the large part of dataset shows that Customers does not accept the Personal Loan offer the Prediction will also be that most of the Customers would not accept the Personal Loan offer. However going by that principle Bank should not stop marketing the product to such customers. As there would certain cases where Customer may not have availed the Personal Loan in past but in future the requirement may change.
Recall should be maximized: the greater the recall, the higher the chances of correctly identifying the customers who would accept a personal loan (minimizing missed opportunities).
# Decision tree classifier; the class weights up-weight the rare positive
# class (loan acceptors) relative to the majority class.
model = DecisionTreeClassifier(
    random_state=1, criterion="gini", class_weight={0: 0.15, 1: 0.85}
)
model.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, random_state=1)
# Confusion matrix and recall score on the training data
confusion_matrix_sklearn(model, X_train, y_train)
decision_tree_perf_train = get_recall_score(model, X_train, y_train)
print("Recall Score:", decision_tree_perf_train)
Recall Score: 1.0
Observations:
# Confusion matrix and recall score on the held-out test data
confusion_matrix_sklearn(model, X_test, y_test)
decision_tree_perf_test = get_recall_score(model, X_test, y_test)
print("Recall Score:", decision_tree_perf_test)
Recall Score: 0.8456375838926175
Observations:
# feature names used to label the tree's split nodes
feature_names = X_train.columns.to_list()
# plot the fitted decision tree
plt.figure(figsize=(20, 30))
out = tree.plot_tree(
    model,
    feature_names=feature_names,
    filled=True,  # colour nodes by majority class
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# below code will add arrows to the decision tree split if they are missing
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
plt.show()
# Text report showing the decision rules of the fitted tree, with class weights
print(tree.export_text(model, feature_names=feature_names, show_weights=True))
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [374.10, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- CD_Account <= 0.50 | | | |--- CCAvg <= 3.95 | | | | |--- Income <= 81.50 | | | | | |--- Age <= 36.50 | | | | | | |--- Family <= 3.50 | | | | | | | |--- CCAvg <= 3.50 | | | | | | | | |--- Income <= 75.00 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Income > 75.00 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- CCAvg > 3.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Family > 3.50 | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | |--- Age > 36.50 | | | | | | |--- County_Los Angeles County <= 0.50 | | | | | | | |--- CreditCard <= 0.50 | | | | | | | | |--- weights: [3.45, 0.00] class: 0 | | | | | | | |--- CreditCard > 0.50 | | | | | | | | |--- weights: [1.80, 0.00] class: 0 | | | | | | |--- County_Los Angeles County > 0.50 | | | | | | | |--- Experience <= 20.00 | | | | | | | | |--- Experience <= 17.50 | | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | | |--- Experience > 17.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Experience > 20.00 | | | | | | | | |--- weights: [1.05, 0.00] class: 0 | | | | |--- Income > 81.50 | | | | | |--- Mortgage <= 152.00 | | | | | | |--- County_Los Angeles County <= 0.50 | | | | | | | |--- Securities_Account <= 0.50 | | | | | | | | |--- County_Yolo County <= 0.50 | | | | | | | | | |--- Family <= 3.50 | | | | | | | | | | |--- Income <= 84.00 | | | | | | | | | | | |--- weights: [0.00, 5.10] class: 1 | | | | | | | | | | |--- Income > 84.00 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | |--- Family > 3.50 | | | | | | | | | | |--- County_Kern County <= 0.50 | | | | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | | | | |--- County_Kern County > 0.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- County_Yolo County > 0.50 | | | | 
| | | | | |--- Online <= 0.50 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | |--- Online > 0.50 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Securities_Account > 0.50 | | | | | | | | |--- Experience <= 12.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Experience > 12.50 | | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | |--- County_Los Angeles County > 0.50 | | | | | | | |--- Family <= 1.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Family > 1.50 | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | |--- Mortgage > 152.00 | | | | | | |--- County_Marin County <= 0.50 | | | | | | | |--- weights: [0.90, 0.00] class: 0 | | | | | | |--- County_Marin County > 0.50 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | |--- CCAvg > 3.95 | | | | |--- weights: [6.75, 0.00] class: 0 | | |--- CD_Account > 0.50 | | | |--- CCAvg <= 4.50 | | | | |--- weights: [0.00, 6.80] class: 1 | | | |--- CCAvg > 4.50 | | | | |--- weights: [0.15, 0.00] class: 0 |--- Income > 98.50 | |--- Education <= 1.50 | | |--- Family <= 2.50 | | | |--- Income <= 100.00 | | | | |--- CCAvg <= 4.20 | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | |--- CCAvg > 4.20 | | | | | |--- CCAvg <= 4.50 | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- CCAvg > 4.50 | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | |--- Income > 100.00 | | | | |--- Income <= 103.50 | | | | | |--- Securities_Account <= 0.50 | | | | | | |--- weights: [2.10, 0.00] class: 0 | | | | | |--- Securities_Account > 0.50 | | | | | | |--- CreditCard <= 0.50 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- CreditCard > 0.50 | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | |--- Income > 103.50 | | | | | |--- Securities_Account <= 0.50 | | | | | | |--- weights: [57.90, 0.00] class: 0 | | | | | |--- Securities_Account > 0.50 | | | | | | |--- weights: [7.05, 0.00] 
class: 0 | | |--- Family > 2.50 | | | |--- Income <= 108.50 | | | | |--- Experience <= 3.50 | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | |--- Experience > 3.50 | | | | | |--- weights: [1.20, 0.00] class: 0 | | | |--- Income > 108.50 | | | | |--- Age <= 26.00 | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- Age > 26.00 | | | | | |--- Income <= 113.50 | | | | | | |--- CCAvg <= 4.20 | | | | | | | |--- Income <= 112.00 | | | | | | | | |--- CreditCard <= 0.50 | | | | | | | | | |--- weights: [0.00, 2.55] class: 1 | | | | | | | | |--- CreditCard > 0.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Income > 112.00 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- CCAvg > 4.20 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Income > 113.50 | | | | | | |--- weights: [0.00, 41.65] class: 1 | |--- Education > 1.50 | | |--- Income <= 116.50 | | | |--- CCAvg <= 2.80 | | | | |--- Income <= 106.50 | | | | | |--- weights: [5.40, 0.00] class: 0 | | | | |--- Income > 106.50 | | | | | |--- Experience <= 31.50 | | | | | | |--- Experience <= 3.50 | | | | | | | |--- weights: [1.50, 0.00] class: 0 | | | | | | |--- Experience > 3.50 | | | | | | | |--- Family <= 3.50 | | | | | | | | |--- County_Santa Barbara County <= 0.50 | | | | | | | | | |--- Experience <= 4.50 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | |--- Experience > 4.50 | | | | | | | | | | |--- County_San Diego County <= 0.50 | | | | | | | | | | | |--- weights: [3.00, 0.00] class: 0 | | | | | | | | | | |--- County_San Diego County > 0.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- County_Santa Barbara County > 0.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Family > 3.50 | | | | | | | | |--- County_Santa Clara County <= 0.50 | | | | | | | | | |--- weights: [0.00, 3.40] class: 1 | | | | | | | | |--- County_Santa Clara County > 0.50 | | | | | | | | | |--- 
weights: [0.30, 0.00] class: 0 | | | | | |--- Experience > 31.50 | | | | | | |--- weights: [1.50, 0.00] class: 0 | | | |--- CCAvg > 2.80 | | | | |--- Age <= 63.50 | | | | | |--- County_Santa Barbara County <= 0.50 | | | | | | |--- County_Yolo County <= 0.50 | | | | | | | |--- Family <= 1.50 | | | | | | | | |--- Online <= 0.50 | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | | |--- Online > 0.50 | | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | | |--- Family > 1.50 | | | | | | | | |--- Income <= 99.50 | | | | | | | | | |--- Mortgage <= 250.75 | | | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | | | |--- Mortgage > 250.75 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Income > 99.50 | | | | | | | | | |--- Experience <= 35.50 | | | | | | | | | | |--- CreditCard <= 0.50 | | | | | | | | | | | |--- weights: [0.00, 12.75] class: 1 | | | | | | | | | | |--- CreditCard > 0.50 | | | | | | | | | | | |--- weights: [0.00, 3.40] class: 1 | | | | | | | | | |--- Experience > 35.50 | | | | | | | | | | |--- County_Santa Clara County <= 0.50 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- County_Santa Clara County > 0.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- County_Yolo County > 0.50 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- County_Santa Barbara County > 0.50 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- Age > 63.50 | | | | | |--- weights: [0.30, 0.00] class: 0 | | |--- Income > 116.50 | | | |--- County_Sacramento County <= 0.50 | | | | |--- weights: [0.00, 182.75] class: 1 | | | |--- County_Sacramento County > 0.50 | | | | |--- weights: [0.00, 5.95] class: 1
Observations:
Both the visual tree and the text rule report, being complex in nature, are difficult to interpret.
# importance of features in the tree building ( The importance of a feature is computed as the
# (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )
# Feature importances of the default tree: the (normalized) total reduction of
# the split criterion contributed by each feature, a.k.a. Gini importance.
imp_df = pd.DataFrame(
    model.feature_importances_, index=X_train.columns, columns=["Imp"]
)
print(imp_df.sort_values("Imp", ascending=False))
Imp Income 5.904702e-01 Family 1.444360e-01 Education 1.262437e-01 CCAvg 7.944044e-02 Experience 1.350154e-02 CD_Account 1.099955e-02 Age 6.639435e-03 Securities_Account 4.976279e-03 Mortgage 4.251449e-03 County_Los Angeles County 3.773823e-03 County_San Diego County 3.065931e-03 County_Santa Barbara County 2.485045e-03 County_Santa Clara County 2.281170e-03 Online 2.013197e-03 County_Kern County 1.990057e-03 County_Yolo County 1.963808e-03 County_Orange County 7.469342e-04 CreditCard 7.213956e-04 County_Sacramento County 3.144373e-15 County_Marin County 1.253189e-17 County_Humboldt County 0.000000e+00 County_Siskiyou County 0.000000e+00 County_San Mateo County 0.000000e+00 County_Butte County 0.000000e+00 County_Santa Cruz County 0.000000e+00 County_Shasta County 0.000000e+00 County_Solano County 0.000000e+00 County_San Joaquin County 0.000000e+00 County_Sonoma County 0.000000e+00 County_Stanislaus County 0.000000e+00 County_Trinity County 0.000000e+00 County_Tuolumne County 0.000000e+00 County_Ventura 0.000000e+00 County_Ventura County 0.000000e+00 County_San Luis Obispo County 0.000000e+00 County_San Francisco 0.000000e+00 County_San Francisco County 0.000000e+00 County_Imperial County 0.000000e+00 County_Contra Costa County 0.000000e+00 County_San Bernardino County 0.000000e+00 County_San Benito County 0.000000e+00 County_Riverside County 0.000000e+00 County_Placer County 0.000000e+00 County_El Dorado County 0.000000e+00 County_Monterey County 0.000000e+00 County_Merced County 0.000000e+00 County_Mendocino County 0.000000e+00 County_Fresno County 0.000000e+00 County_Lake County 0.000000e+00 County_Fullerton 0.000000e+00 County_Irvine 0.000000e+00 County_Napa County 0.000000e+00
# Horizontal bar chart of the default tree's feature importances,
# ordered from least to most important (argsort is ascending).
importances = model.feature_importances_
indices = np.argsort(importances)
positions = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(positions, importances[indices], color="violet", align="center")
plt.yticks(positions, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
Observations:
According to the decision tree, Income is the most important variable in predicting whether a customer will accept a personal loan offer.
# Tune a decision tree with an exhaustive grid search, optimizing recall via
# 5-fold cross-validation. The class weights penalize misclassifying a loan
# taker (class 1) far more heavily than a non-taker (class 0).
estimator = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})

# Hyperparameter grid to search over.
parameters = {
    "max_depth": [5, 10, 15, None],
    "criterion": ["entropy", "gini"],
    "splitter": ["best", "random"],
    "min_impurity_decrease": [0.00001, 0.0001, 0.01],
}

# Score parameter combinations by recall: we care most about catching buyers.
scorer = make_scorer(recall_score)

# Run the search; GridSearchCV.fit returns the fitted search object itself.
grid_obj = GridSearchCV(estimator, parameters, scoring=scorer, cv=5).fit(
    X_train, y_train
)

# Keep the best estimator found and refit it on the full training data.
estimator = grid_obj.best_estimator_
estimator.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, max_depth=5,
min_impurity_decrease=1e-05, random_state=1)
# Confusion matrix and recall of the tuned tree on the training set.
confusion_matrix_sklearn(estimator, X_train, y_train)

decision_tree_tune_perf_train = get_recall_score(estimator, X_train, y_train)
print(f"Recall Score: {decision_tree_tune_perf_train}")
Recall Score: 0.9667673716012085
# Confusion matrix and recall of the tuned tree on the held-out test set.
confusion_matrix_sklearn(estimator, X_test, y_test)

decision_tree_tune_perf_test = get_recall_score(estimator, X_test, y_test)
print(f"Recall Score: {decision_tree_tune_perf_test}")
Recall Score: 0.8926174496644296
Observations:
# Visualize the tuned tree; recolor edge arrows black for readability.
plt.figure(figsize=(15, 10))
out = tree.plot_tree(
    estimator,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for artist in out:
    patch = artist.arrow_patch
    if patch is not None:
        patch.set_edgecolor("black")
        patch.set_linewidth(1)
plt.show()

# Dump the tree's decision rules as text, with class weights at each leaf.
print(tree.export_text(estimator, feature_names=feature_names, show_weights=True))
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [374.10, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- CD_Account <= 0.50 | | | |--- CCAvg <= 3.95 | | | | |--- Income <= 81.50 | | | | | |--- weights: [7.35, 2.55] class: 0 | | | | |--- Income > 81.50 | | | | | |--- weights: [4.35, 9.35] class: 1 | | | |--- CCAvg > 3.95 | | | | |--- weights: [6.75, 0.00] class: 0 | | |--- CD_Account > 0.50 | | | |--- CCAvg <= 4.50 | | | | |--- weights: [0.00, 6.80] class: 1 | | | |--- CCAvg > 4.50 | | | | |--- weights: [0.15, 0.00] class: 0 |--- Income > 98.50 | |--- Education <= 1.50 | | |--- Family <= 2.50 | | | |--- Income <= 100.00 | | | | |--- CCAvg <= 4.20 | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | |--- CCAvg > 4.20 | | | | | |--- weights: [0.00, 1.70] class: 1 | | | |--- Income > 100.00 | | | | |--- Income <= 103.50 | | | | | |--- weights: [2.25, 0.85] class: 0 | | | | |--- Income > 103.50 | | | | | |--- weights: [64.95, 0.00] class: 0 | | |--- Family > 2.50 | | | |--- Income <= 108.50 | | | | |--- Experience <= 3.50 | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | |--- Experience > 3.50 | | | | | |--- weights: [1.20, 0.00] class: 0 | | | |--- Income > 108.50 | | | | |--- Age <= 26.00 | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- Age > 26.00 | | | | | |--- weights: [0.30, 45.05] class: 1 | |--- Education > 1.50 | | |--- Income <= 116.50 | | | |--- CCAvg <= 2.80 | | | | |--- Income <= 106.50 | | | | | |--- weights: [5.40, 0.00] class: 0 | | | | |--- Income > 106.50 | | | | | |--- weights: [6.45, 5.95] class: 0 | | | |--- CCAvg > 2.80 | | | | |--- Age <= 63.50 | | | | | |--- weights: [1.20, 19.55] class: 1 | | | | |--- Age > 63.50 | | | | | |--- weights: [0.30, 0.00] class: 0 | | |--- Income > 116.50 | | | |--- weights: [0.00, 188.70] class: 1
Observations:
# Gini importances of the tuned tree. With the shallower, tuned tree the
# importance mass concentrates in fewer features, so the top values rise
# relative to the unpruned tree.
tuned_imp = pd.DataFrame(
    estimator.feature_importances_, index=X_train.columns, columns=["Imp"]
)
print(tuned_imp.sort_values("Imp", ascending=False))
Imp Income 0.622927 Family 0.146723 Education 0.130659 CCAvg 0.082588 CD_Account 0.011659 Experience 0.002984 Age 0.002459 County_San Benito County 0.000000 County_Santa Clara County 0.000000 County_San Bernardino County 0.000000 County_San Diego County 0.000000 County_San Francisco 0.000000 County_San Francisco County 0.000000 County_San Joaquin County 0.000000 County_San Luis Obispo County 0.000000 County_San Mateo County 0.000000 County_Santa Barbara County 0.000000 County_Siskiyou County 0.000000 County_Santa Cruz County 0.000000 County_Shasta County 0.000000 County_Riverside County 0.000000 County_Solano County 0.000000 County_Sonoma County 0.000000 County_Stanislaus County 0.000000 County_Trinity County 0.000000 County_Tuolumne County 0.000000 County_Ventura 0.000000 County_Ventura County 0.000000 County_Sacramento County 0.000000 County_Napa County 0.000000 County_Placer County 0.000000 County_Orange County 0.000000 Mortgage 0.000000 Securities_Account 0.000000 Online 0.000000 CreditCard 0.000000 County_Butte County 0.000000 County_Contra Costa County 0.000000 County_El Dorado County 0.000000 County_Fresno County 0.000000 County_Fullerton 0.000000 County_Humboldt County 0.000000 County_Imperial County 0.000000 County_Irvine 0.000000 County_Kern County 0.000000 County_Lake County 0.000000 County_Los Angeles County 0.000000 County_Marin County 0.000000 County_Mendocino County 0.000000 County_Merced County 0.000000 County_Monterey County 0.000000 County_Yolo County 0.000000
# Horizontal bar chart of the tuned tree's feature importances (ascending).
importances = estimator.feature_importances_
indices = np.argsort(importances)
ypos = np.arange(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(ypos, importances[indices], color="violet", align="center")
plt.yticks(ypos, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
Observations:
The DecisionTreeClassifier provides parameters such as min_samples_leaf and max_depth to prevent a tree from overfitting. Cost complexity pruning provides another option to control the size of a tree. In DecisionTreeClassifier, this pruning technique is parameterized by the cost complexity parameter, ccp_alpha. Greater values of ccp_alpha increase the number of nodes pruned. Here we only show the effect of ccp_alpha on regularizing the trees and how to choose a ccp_alpha based on validation scores.
Minimal cost complexity pruning recursively finds the node with the "weakest link". The weakest link is characterized by an effective alpha, where the nodes with the smallest effective alpha are pruned first. To get an idea of what values of ccp_alpha could be appropriate, scikit-learn provides DecisionTreeClassifier.cost_complexity_pruning_path that returns the effective alphas and the corresponding total leaf impurities at each step of the pruning process. As alpha increases, more of the tree is pruned, which increases the total impurity of its leaves.
# Compute the cost-complexity pruning path: the sequence of effective alphas
# and the total leaf impurity reached when pruning at each alpha.
clf = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
pd.DataFrame(path)
| ccp_alphas | impurities | |
|---|---|---|
| 0 | 0.000000e+00 | -3.451500e-15 |
| 1 | 7.482671e-19 | -3.450751e-15 |
| 2 | 7.482671e-19 | -3.450003e-15 |
| 3 | 1.496534e-18 | -3.448506e-15 |
| 4 | 1.496534e-18 | -3.447010e-15 |
| 5 | 2.332833e-18 | -3.444677e-15 |
| 6 | 4.665666e-18 | -3.440011e-15 |
| 7 | 5.832082e-18 | -3.434179e-15 |
| 8 | 5.854090e-18 | -3.428325e-15 |
| 9 | 6.932475e-18 | -3.421393e-15 |
| 10 | 9.478050e-18 | -3.411915e-15 |
| 11 | 1.143528e-16 | -3.297562e-15 |
| 12 | 1.467352e-15 | -1.830210e-15 |
| 13 | 1.964954e-04 | 3.929907e-04 |
| 14 | 2.625571e-04 | 1.180662e-03 |
| 15 | 3.369896e-04 | 1.517652e-03 |
| 16 | 3.369896e-04 | 1.854641e-03 |
| 17 | 3.643130e-04 | 2.218954e-03 |
| 18 | 3.643130e-04 | 2.947580e-03 |
| 19 | 3.835686e-04 | 3.714718e-03 |
| 20 | 3.879017e-04 | 4.102619e-03 |
| 21 | 3.899406e-04 | 4.492560e-03 |
| 22 | 4.570292e-04 | 6.320677e-03 |
| 23 | 5.860688e-04 | 6.906745e-03 |
| 24 | 6.938264e-04 | 7.600572e-03 |
| 25 | 7.021026e-04 | 9.004777e-03 |
| 26 | 7.071864e-04 | 1.254071e-02 |
| 27 | 7.286261e-04 | 1.326934e-02 |
| 28 | 9.404360e-04 | 1.420977e-02 |
| 29 | 9.407728e-04 | 1.609132e-02 |
| 30 | 1.011155e-03 | 1.710247e-02 |
| 31 | 1.030992e-03 | 1.813346e-02 |
| 32 | 1.129211e-03 | 2.152110e-02 |
| 33 | 1.151207e-03 | 2.267230e-02 |
| 34 | 1.242301e-03 | 2.639920e-02 |
| 35 | 1.315081e-03 | 2.771429e-02 |
| 36 | 1.399934e-03 | 2.911422e-02 |
| 37 | 1.638043e-03 | 3.075226e-02 |
| 38 | 1.717019e-03 | 3.246928e-02 |
| 39 | 2.289245e-03 | 3.475853e-02 |
| 40 | 2.742431e-03 | 3.750096e-02 |
| 41 | 3.335999e-03 | 4.083696e-02 |
| 42 | 3.527226e-03 | 4.436418e-02 |
| 43 | 5.138280e-03 | 4.950246e-02 |
| 44 | 9.008434e-03 | 5.851090e-02 |
| 45 | 1.005479e-02 | 6.856568e-02 |
| 46 | 2.253222e-02 | 9.109791e-02 |
| 47 | 6.112098e-02 | 2.133399e-01 |
| 48 | 2.537957e-01 | 4.671356e-01 |
# Total leaf impurity versus effective alpha. The final alpha (which prunes
# the tree down to a single node) is omitted from the plot.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.show()

# Fit one tree per candidate alpha. The largest alpha prunes the whole tree,
# so `clfs[-1]` ends up as a single-node stump.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(
        random_state=1, ccp_alpha=ccp_alpha, class_weight={0: 0.15, 1: 0.85}
    )
    clf.fit(X_train, y_train)
    clfs.append(clf)
print(
    f"Number of nodes in the last tree is: {clfs[-1].tree_.node_count} "
    f"with ccp_alpha: {ccp_alphas[-1]}"
)
Number of nodes in the last tree is: 1 with ccp_alpha: 0.2537957148948091
# Discard the single-node stump (last alpha) before comparing models.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# Tree size and depth as a function of alpha.
node_counts = [c.tree_.node_count for c in clfs]
depth = [c.tree_.max_depth for c in clfs]
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()
plt.show()

# Recall of every pruned tree on the training set ...
recall_train = [recall_score(y_train, c.predict(X_train)) for c in clfs]
# ... and on the test set.
recall_test = [recall_score(y_test, c.predict(X_test)) for c in clfs]

# Accuracy of each tree on both sets (computed for reference; not plotted).
train_scores = [c.score(X_train, y_train) for c in clfs]
test_scores = [c.score(X_test, y_test) for c in clfs]

# Recall versus alpha for both sets; the gap between the curves indicates
# how strongly each tree overfits.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs alpha for training and testing sets")
ax.plot(ccp_alphas, recall_train, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
Observations:
# Select the tree with the highest test recall (argmax returns the first
# index in case of ties).
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)
DecisionTreeClassifier(ccp_alpha=0.009008434301508085,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
# Refit the selected tree on the full training data; left as a bare
# expression so the notebook echoes the fitted estimator's repr.
best_model.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.009008434301508085,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
# Confusion matrix and recall of the selected pruned tree on the training set.
confusion_matrix_sklearn(best_model, X_train, y_train)

print(f"Recall Score: {get_recall_score(best_model, X_train, y_train)}")
Recall Score: 0.9909365558912386
# Confusion matrix and recall of the selected pruned tree on the test set.
confusion_matrix_sklearn(best_model, X_test, y_test)

print(f"Recall Score: {get_recall_score(best_model, X_test, y_test)}")
Recall Score: 0.9865771812080537
# Visualize the selected pruned tree; recolor edge arrows black.
plt.figure(figsize=(5, 5))
out = tree.plot_tree(
    best_model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for artist in out:
    patch = artist.arrow_patch
    if patch is not None:
        patch.set_edgecolor("black")
        patch.set_linewidth(1)
plt.show()

# Gini importances of the pruned tree: pruning concentrates the importance
# mass in even fewer features than the tuned tree did.
pruned_imp = pd.DataFrame(
    best_model.feature_importances_, index=X_train.columns, columns=["Imp"]
)
print(pruned_imp.sort_values("Imp", ascending=False))
Imp Income 0.645704 Family 0.158240 Education 0.140915 CCAvg 0.055142 Age 0.000000 County_Santa Clara County 0.000000 County_Sacramento County 0.000000 County_San Benito County 0.000000 County_San Bernardino County 0.000000 County_San Diego County 0.000000 County_San Francisco 0.000000 County_San Francisco County 0.000000 County_San Joaquin County 0.000000 County_San Luis Obispo County 0.000000 County_San Mateo County 0.000000 County_Santa Barbara County 0.000000 County_Shasta County 0.000000 County_Santa Cruz County 0.000000 County_Placer County 0.000000 County_Siskiyou County 0.000000 County_Solano County 0.000000 County_Sonoma County 0.000000 County_Stanislaus County 0.000000 County_Trinity County 0.000000 County_Tuolumne County 0.000000 County_Ventura 0.000000 County_Ventura County 0.000000 County_Riverside County 0.000000 County_Napa County 0.000000 County_Orange County 0.000000 County_Fullerton 0.000000 Mortgage 0.000000 Securities_Account 0.000000 CD_Account 0.000000 Online 0.000000 CreditCard 0.000000 County_Butte County 0.000000 County_Contra Costa County 0.000000 County_El Dorado County 0.000000 County_Fresno County 0.000000 County_Humboldt County 0.000000 Experience 0.000000 County_Imperial County 0.000000 County_Irvine 0.000000 County_Kern County 0.000000 County_Lake County 0.000000 County_Los Angeles County 0.000000 County_Marin County 0.000000 County_Mendocino County 0.000000 County_Merced County 0.000000 County_Monterey County 0.000000 County_Yolo County 0.000000
# Horizontal bar chart of the pruned tree's feature importances (ascending).
importances = best_model.feature_importances_
indices = np.argsort(importances)
ypos = np.arange(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(ypos, importances[indices], color="violet", align="center")
plt.yticks(ypos, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
Observations:
# A second pruned candidate with ccp_alpha fixed at 0.023 (picked from the
# recall-vs-alpha curve); fit is left as the last expression so the notebook
# echoes the fitted estimator.
best_model2 = DecisionTreeClassifier(
    random_state=1, class_weight={0: 0.15, 1: 0.85}, ccp_alpha=0.023
)
best_model2.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.023, class_weight={0: 0.15, 1: 0.85},
random_state=1)
# Confusion matrix and recall of the post-pruned tree on the training set.
confusion_matrix_sklearn(best_model2, X_train, y_train)

decision_tree_postpruned_perf_train = get_recall_score(best_model2, X_train, y_train)
print(f"Recall Score: {decision_tree_postpruned_perf_train}")
Recall Score: 0.9244712990936556
# Confusion matrix and recall of the post-pruned tree on the test set.
confusion_matrix_sklearn(best_model2, X_test, y_test)

decision_tree_postpruned_perf_test = get_recall_score(best_model2, X_test, y_test)
print(f"Recall Score: {decision_tree_postpruned_perf_test}")
Recall Score: 0.8926174496644296
Observations:
# Visualize the post-pruned tree; recolor edge arrows black for readability.
plt.figure(figsize=(15, 10))
out = tree.plot_tree(
    best_model2,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for artist in out:
    patch = artist.arrow_patch
    if patch is not None:
        patch.set_edgecolor("black")
        patch.set_linewidth(1)
plt.show()

# Dump the post-pruned tree's rules as text, with class weights at each leaf.
print(tree.export_text(best_model2, feature_names=feature_names, show_weights=True))
|--- Income <= 98.50 | |--- weights: [392.70, 18.70] class: 0 |--- Income > 98.50 | |--- Education <= 1.50 | | |--- Family <= 2.50 | | | |--- weights: [67.65, 2.55] class: 0 | | |--- Family > 2.50 | | | |--- weights: [1.65, 45.90] class: 1 | |--- Education > 1.50 | | |--- weights: [13.35, 214.20] class: 1
# Gini importances of the post-pruned tree: only the handful of features
# that survived pruning carry any importance.
postpruned_imp = pd.DataFrame(
    best_model2.feature_importances_, index=X_train.columns, columns=["Imp"]
)
print(postpruned_imp.sort_values("Imp", ascending=False))
Imp Income 0.674921 Family 0.171953 Education 0.153126 Age 0.000000 County_Santa Clara County 0.000000 County_Sacramento County 0.000000 County_San Benito County 0.000000 County_San Bernardino County 0.000000 County_San Diego County 0.000000 County_San Francisco 0.000000 County_San Francisco County 0.000000 County_San Joaquin County 0.000000 County_San Luis Obispo County 0.000000 County_San Mateo County 0.000000 County_Santa Barbara County 0.000000 County_Shasta County 0.000000 County_Santa Cruz County 0.000000 County_Placer County 0.000000 County_Siskiyou County 0.000000 County_Solano County 0.000000 County_Sonoma County 0.000000 County_Stanislaus County 0.000000 County_Trinity County 0.000000 County_Tuolumne County 0.000000 County_Ventura 0.000000 County_Ventura County 0.000000 County_Riverside County 0.000000 County_Napa County 0.000000 County_Orange County 0.000000 Experience 0.000000 CCAvg 0.000000 Mortgage 0.000000 Securities_Account 0.000000 CD_Account 0.000000 Online 0.000000 CreditCard 0.000000 County_Butte County 0.000000 County_Contra Costa County 0.000000 County_El Dorado County 0.000000 County_Fresno County 0.000000 County_Fullerton 0.000000 County_Humboldt County 0.000000 County_Imperial County 0.000000 County_Irvine 0.000000 County_Kern County 0.000000 County_Lake County 0.000000 County_Los Angeles County 0.000000 County_Marin County 0.000000 County_Mendocino County 0.000000 County_Merced County 0.000000 County_Monterey County 0.000000 County_Yolo County 0.000000
# Horizontal bar chart of the post-pruned tree's importances (ascending).
importances = best_model2.feature_importances_
indices = np.argsort(importances)
ypos = np.arange(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(ypos, importances[indices], color="violet", align="center")
plt.yticks(ypos, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
Observations:
Income, Education and Family are the three important parameters that influence a customer's willingness to accept a personal loan offer.
# Training-set recall of all three trees collected into a comparison table;
# index labels are set at construction time. Last expression echoes the frame.
train_labels = [
    "Recall on Default Decision Tree",
    "Recall on Tuned Decision Tree",
    "Recall on Post Pruned Decision Tree",
]
models_train_comp_df = pd.DataFrame(
    [
        decision_tree_perf_train,
        decision_tree_tune_perf_train,
        decision_tree_postpruned_perf_train,
    ],
    index=train_labels,
    columns=["Recall on training set"],
)
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Recall on training set | |
|---|---|
| Recall on Default Decision Tree | 1.000000 |
| Recall on Tuned Decision Tree | 0.966767 |
| Recall on Post Pruned Decision Tree | 0.924471 |
# Test-set recall of all three trees collected into a comparison table;
# index labels are set at construction time. Last expression echoes the frame.
test_labels = [
    "Recall on Default Decision Tree",
    "Recall on Tuned Decision Tree",
    "Recall on Post Pruned Decision Tree",
]
models_test_comp_df = pd.DataFrame(
    [
        decision_tree_perf_test,
        decision_tree_tune_perf_test,
        decision_tree_postpruned_perf_test,
    ],
    index=test_labels,
    columns=["Recall on testing set"],
)
print("Test performance comparison:")
models_test_comp_df
Test performance comparison:
| Recall on testing set | |
|---|---|
| Recall on Default Decision Tree | 0.845638 |
| Recall on Tuned Decision Tree | 0.892617 |
| Recall on Post Pruned Decision Tree | 0.892617 |
Observations:
Personal Loan:
Other Products and Services:
# Module : Supervised Learning Classification
# Project: loan-modelling - supervised learning
# Submitted by : Ritesh Sharma
# Submission Date : 04 Sep 2021